# --------------------------------
# setup
# --------------------------------

# libraries
library(dplyr)
library(labelProp)
library(text2vec)
library(pbapply)
library(readxl)

# --------------------------------
# load data
# --------------------------------

# granier data: must carry a character `text` column (it is cleaned below)
granier_data <- readRDS("data/outputs/granier_data.rds")

# keep only letters; every other character becomes a space.
# The class is Unicode-aware (\p{L}) rather than [a-zA-Z] so that Spanish
# letters such as á, é, í, ó, ú, ü and ñ are preserved; an ASCII-only class
# would split words like "constitución" into the fragments "constituci" + "n".
# enc2utf8() ensures PCRE sees UTF-8 input regardless of the native locale.
# NOTE(review): case is left untouched here -- confirm that upstream
# normalization matches the casing of the GloVe vocabulary.
granier_data$text <- gsub(
  "[^\\p{L}]",
  " ",
  enc2utf8(granier_data$text),
  perl = TRUE
)

# stopwords (presumably a character vector of Spanish stopwords -- confirm)
stopwords_es <- readRDS("data/outputs/stopwords_es.rds")

# pre-trained glove word embeddings; terms are expected as row names
glove <- readRDS("data/inputs/glove_spanish.rds")

# --------------------------------
# create and prune vocab
# --------------------------------

# tokenize on whitespace and build the vocabulary, excluding stopwords
tokens <- space_tokenizer(granier_data$text)
it <- itoken(tokens, progressbar = FALSE)
vocab_tibble <- create_vocabulary(it, stopwords = stopwords_es)

# check how many vocabulary terms have a pre-trained word embedding
table(vocab_tibble$term %in% rownames(glove))

# candidate vocab: corpus terms that also have an embedding
# (intersect() already de-duplicates, so no explicit unique() is needed)
candidate_vocab <- intersect(vocab_tibble$term, rownames(glove))

# fail early with a clear message instead of building an empty graph
if (length(candidate_vocab) == 0L) {
  stop(
    "no vocabulary term has a pre-trained embedding; ",
    "check text cleaning and the embedding row names",
    call. = FALSE
  )
}

# subset pre-trained embeddings to candidate vocab, keeping glove's row order;
# drop = FALSE keeps the 2-d structure even when a single term matches
glove_subset <- glove[
  intersect(rownames(glove), candidate_vocab), ,
  drop = FALSE
]

# --------------------------------
# compute transition matrix, propagate labels and save results
# --------------------------------
# build a transition matrix from the embedding subset
transition_matrix <- build_transition_matrix(x = glove_subset, threads = 6L)

# define seeds (labeled nodes): one seed word per label
seeds <- list(pobreza = "pobreza", constituyente = "constituyente")

# fail early if a seed word has no row in the embedding subset, rather than
# letting the propagation step fail (or misbehave) on an unknown node
missing_seeds <- setdiff(
  unlist(seeds, use.names = FALSE),
  rownames(glove_subset)
)
if (length(missing_seeds) > 0L) {
  stop(
    "seed word(s) missing from the embedding vocabulary: ",
    paste(missing_seeds, collapse = ", "),
    call. = FALSE
  )
}

# propagate label using rw algorithm; the seed is fixed for reproducibility
# NOTE(review): the p.value column used below presumably comes from the
# permutation option (permute = TRUE) -- confirm against labelProp docs
set.seed(2022L)
rw_labels <- labelProp(
  x = transition_matrix,
  seeds = seeds,
  method = "rw",
  bootstrap = FALSE,
  permute = TRUE,
  num_permutations = 100,
  beta = 0.5,
  softmax = FALSE
)

# Write the top-scoring significant terms for one label to an xlsx file.
#   labels: data frame with `p.value` and `score` columns
#   path:   destination .xlsx file
#   n:      maximum number of rows to keep (fewer if not enough qualify)
#   alpha:  significance threshold applied to `p.value`
save_top_neighbors <- function(labels, path, n = 20L, alpha = 0.05) {
  top_terms <- labels %>%
    filter(p.value < alpha) %>%
    arrange(desc(score)) %>%
    slice_head(n = n)
  invisible(openxlsx::write.xlsx(top_terms, path))
}

# save output
save_top_neighbors(
  rw_labels$pobreza,
  "data/outputs/nns_pobreza_all.xlsx"
)
save_top_neighbors(
  rw_labels$constituyente,
  "data/outputs/nns_constituyente_all.xlsx"
)



